{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 获取数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导入外部数据  \n",
    "导入数据主要用到pandas的read_x()方法，x表示待导入文件的式"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 导入.xlsx文件  \n",
    "- 用read_excel()方法，传入文件路径即可，避免路径被转义前面加一个**r**\n",
    "- 指定导入那个Sheet可以设定sheet_name参数\n",
    "- 指定行索引index_col参数，默认是从0开始\n",
    "- 指定列索引header参数，默认为0，即第一行作为列索引\n",
    "- 指定导入列usecols参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A3</td>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A4</td>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A5</td>\n",
       "      <td>51</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>A6</td>\n",
       "      <td>26</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别       注册时间\n",
       "0  A1  54  男 2018-08-08\n",
       "1  A2  16  女 2018-08-09\n",
       "2  A3  47  女 2018-08-10\n",
       "3  A4  41  男 2018-08-11\n",
       "4  A5  51  女 2018-08-12\n",
       "5  A6  26  男 2018-08-13"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#直接导入\n",
    "import pandas as pd\n",
    "df = pd.read_excel(r\"../Data/Chapter04.xlsx\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A3</td>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A4</td>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A5</td>\n",
       "      <td>51</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>A6</td>\n",
       "      <td>26</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别       注册时间\n",
       "0  A1  54  男 2018-08-08\n",
       "1  A2  16  女 2018-08-09\n",
       "2  A3  47  女 2018-08-10\n",
       "3  A4  41  男 2018-08-11\n",
       "4  A5  51  女 2018-08-12\n",
       "5  A6  26  男 2018-08-13"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#导入具体的sheet\n",
    "df = pd.read_excel(r\"../Data/Chapter04.xlsx\",sheet_name = 0)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>编号</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A1</th>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A2</th>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A3</th>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A4</th>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A5</th>\n",
       "      <td>51</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A6</th>\n",
       "      <td>26</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    年龄 性别       注册时间\n",
       "编号                  \n",
       "A1  54  男 2018-08-08\n",
       "A2  16  女 2018-08-09\n",
       "A3  47  女 2018-08-10\n",
       "A4  41  男 2018-08-11\n",
       "A5  51  女 2018-08-12\n",
       "A6  26  男 2018-08-13"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#指定行索引\n",
    "df = pd.read_excel(r\"../Data/Chapter04.xlsx\",sheet_name = 0,index_col = 0)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A3</td>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A4</td>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A5</td>\n",
       "      <td>51</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>A6</td>\n",
       "      <td>26</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别       注册时间\n",
       "0  A1  54  男 2018-08-08\n",
       "1  A2  16  女 2018-08-09\n",
       "2  A3  47  女 2018-08-10\n",
       "3  A4  41  男 2018-08-11\n",
       "4  A5  51  女 2018-08-12\n",
       "5  A6  26  男 2018-08-13"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#指定列索引\n",
    "df = pd.read_excel(r\"../Data/Chapter04.xlsx\",sheet_name = 0,header = 0)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>A6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号\n",
       "0  A1\n",
       "1  A2\n",
       "2  A3\n",
       "3  A4\n",
       "4  A5\n",
       "5  A6"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#导入指定的列\n",
    "df = pd.read_excel(r\"../Data/Chapter04.xlsx\",usecols =0)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>51</td>\n",
       "      <td>女</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>26</td>\n",
       "      <td>男</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   年龄 性别\n",
       "0  54  男\n",
       "1  16  女\n",
       "2  47  女\n",
       "3  41  男\n",
       "4  51  女\n",
       "5  26  男"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#导入多列\n",
    "df = pd.read_excel(r\"../Data/Chapter04.xlsx\",sheet_name = 0,usecols =[1,2])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A3</td>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A4</td>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别       注册时间\n",
       "0  A1  54  男 2018-08-08\n",
       "1  A2  16  女 2018-08-09\n",
       "2  A3  47  女 2018-08-10\n",
       "3  A4  41  男 2018-08-11"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#读取中文表名\n",
    "df = pd.read_excel(r\"../Data/Chapter04.xlsx\",sheet_name = \"你好\")\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 导入.csv文件  \n",
    "- read_csv()方法直接导入\n",
    "- sep参数指明分隔符\n",
    "- nrows参数指明读取行\n",
    "- engineing参数指定文件编码格式\n",
    "- 如果文件路径含有中文需要设定engine\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018/8/8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018/8/9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A3</td>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "      <td>2018/8/10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A4</td>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "      <td>2018/8/11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别       注册时间\n",
       "0  A1  54  男   2018/8/8\n",
       "1  A2  16  女   2018/8/9\n",
       "2  A3  47  女  2018/8/10\n",
       "3  A4  41  男  2018/8/11"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#直接导入\n",
    "df = pd.read_csv(r\"../Data/Chapter04.csv\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018/8/8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018/8/9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A3</td>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "      <td>2018/8/10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A4</td>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "      <td>2018/8/11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别       注册时间\n",
       "0  A1  54  男   2018/8/8\n",
       "1  A2  16  女   2018/8/9\n",
       "2  A3  47  女  2018/8/10\n",
       "3  A4  41  男  2018/8/11"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#指定分隔符\n",
    "df = pd.read_csv(r\"../Data/Chapter04.1.csv\",sep=\" \")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018/8/8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018/8/9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别      注册时间\n",
       "0  A1  54  男  2018/8/8\n",
       "1  A2  16  女  2018/8/9"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#指定读取行数\n",
    "df = pd.read_csv(r\"../Data/Chapter04.csv\",nrows =2)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 导入.txt文件  \n",
    "- 用read_table()方法读取\n",
    "- sep参数指定分隔符\n",
    "- 也可以读取csv文件\n",
    "- 其他参数与read_csv()方法基本一致"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018/8/8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018/8/9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A3</td>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "      <td>2018/8/10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A4</td>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "      <td>2018/8/11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别       注册时间\n",
       "0  A1  54  男   2018/8/8\n",
       "1  A2  16  女   2018/8/9\n",
       "2  A3  47  女  2018/8/10\n",
       "3  A4  41  男  2018/8/11"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#指定读取行数\n",
    "df = pd.read_csv(r\"../Data/Chapter04.txt\",sep =\",\")\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 导入sql文件\n",
    "- 连接数据库\n",
    "- 执行SQL查询"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>migration</th>\n",
       "      <th>batch</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9</td>\n",
       "      <td>2014_10_12_000000_create_users_table</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "      <td>2014_10_12_100000_create_password_resets_table</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>11</td>\n",
       "      <td>2019_01_08_103331_create_banners_table</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12</td>\n",
       "      <td>2019_01_10_102434_create_tags_table</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>13</td>\n",
       "      <td>2019_01_11_134855_create_articles_table</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>14</td>\n",
       "      <td>2019_01_11_135017_create_contents_table</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>15</td>\n",
       "      <td>2019_01_11_135138_create_article_tags_table</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>16</td>\n",
       "      <td>2019_01_14_124801_create_settings_table</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                       migration  batch\n",
       "0   9            2014_10_12_000000_create_users_table      1\n",
       "1  10  2014_10_12_100000_create_password_resets_table      1\n",
       "2  11          2019_01_08_103331_create_banners_table      1\n",
       "3  12             2019_01_10_102434_create_tags_table      1\n",
       "4  13         2019_01_11_134855_create_articles_table      1\n",
       "5  14         2019_01_11_135017_create_contents_table      1\n",
       "6  15     2019_01_11_135138_create_article_tags_table      1\n",
       "7  16         2019_01_14_124801_create_settings_table      1"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#导入pymysql模块\n",
    "import pymysql\n",
    "import pandas as pd\n",
    "\n",
    "#创建连接\n",
    "eng = pymysql.connect(host='localhost',\n",
    "                      user='root',\n",
    "                      password='',\n",
    "                      db='new_website',\n",
    "                      charset='utf8')\n",
    "sql= 'SELECT * FROM lara_migrations'\n",
    "df = pd.read_sql(sql,eng,)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 新建数据  \n",
    "利用pd.DataFrame()方法进行新建"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 熟悉数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 利用head预览前几行  \n",
    "head()方法如果不传参数，默认显示**前5行**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A3</td>\n",
       "      <td>47</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A4</td>\n",
       "      <td>41</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A5</td>\n",
       "      <td>51</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别       注册时间\n",
       "0  A1  54  男 2018-08-08\n",
       "1  A2  16  女 2018-08-09\n",
       "2  A3  47  女 2018-08-10\n",
       "3  A4  41  男 2018-08-11\n",
       "4  A5  51  女 2018-08-12"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"../Data/Chapter04.xlsx\")\n",
    "df\n",
    "#默认显示\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号  年龄 性别       注册时间\n",
       "0  A1  54  男 2018-08-08\n",
       "1  A2  16  女 2018-08-09"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#显示前2行\n",
    "df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 利用shape获取数据表的大小  \n",
    "Python中shape方法获取数据表的行和列数**(注意没括号)**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6, 4)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"../Data/Chapter04.xlsx\")\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 利用info获取数据类型  \n",
    "Python中info()方法查看数据表的中的数据类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 6 entries, 0 to 5\n",
      "Data columns (total 4 columns):\n",
      "编号      6 non-null object\n",
      "年龄      6 non-null int64\n",
      "性别      6 non-null object\n",
      "注册时间    6 non-null datetime64[ns]\n",
      "dtypes: datetime64[ns](1), int64(1), object(2)\n",
      "memory usage: 272.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 利用describe获取数值分布情况\n",
    "Python中describe()方法可以获取所有数值类型字段的分布值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>年龄</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>6.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>39.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>15.065413</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>16.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>29.750000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>44.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>50.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>54.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              年龄\n",
       "count   6.000000\n",
       "mean   39.166667\n",
       "std    15.065413\n",
       "min    16.000000\n",
       "25%    29.750000\n",
       "50%    44.000000\n",
       "75%    50.000000\n",
       "max    54.000000"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "第4章  获取数据源",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "320px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
